@Article{fi18020112, AUTHOR = {Roumeliotis, Konstantinos I. and Margaris, Dionisis and Spiliotopoulos, Dimitris and Vassilakis, Costas}, TITLE = {A Large-Scale Empirical Study of LLM Orchestration and Ensemble Strategies for Sentiment Analysis in Recommender Systems}, JOURNAL = {Future Internet}, VOLUME = {18}, YEAR = {2026}, NUMBER = {2}, ARTICLE-NUMBER = {112}, URL = {https://www.mdpi.com/1999-5903/18/2/112}, ISSN = {1999-5903}, ABSTRACT = {This paper presents a comprehensive empirical evaluation comparing meta-model aggregation strategies with traditional ensemble methods and standalone models for sentiment analysis in recommender systems beyond standalone large language model (LLM) performance. We investigate whether aggregating multiple LLMs through a reasoning-based meta-model provides measurable performance advantages over individual models and standard statistical aggregation approaches in zero-shot sentiment classification. Using a balanced dataset of 5000 verified Amazon purchase reviews (1000 reviews per rating category from 1 to 5 stars, sampled via two-stage stratified sampling across five product categories), we evaluate 12 different leading pre-trained LLMs from four major providers (OpenAI, Anthropic, Google, and DeepSeek) in both standalone and meta-model configurations. Our experimental design systematically compares individual model performance against GPT-based meta-model aggregation and traditional ensemble baselines (majority voting, mean aggregation). Results show statistically significant improvements (McNemar’s test, p < 0.001): the GPT-5 meta-model achieves 71.40% accuracy (10.15 percentage point improvement over the 61.25% individual model average), while the GPT-5 mini meta-model reaches 70.32% (9.07 percentage point improvement). These observed improvements surpass traditional ensemble methods (majority voting: 62.64%; mean aggregation: 62.96%), suggesting potential value in meta-model aggregation for sentiment analysis tasks. Our analysis reveals empirical patterns including neutral sentiment classification challenges (3-star ratings show 64.83% failure rates across models), model influence hierarchies, and cost-accuracy trade-offs ($130.45 aggregation cost vs. $0.24–$43.97 for individual models per 5000 predictions). This work provides evidence-based insights into the comparative effectiveness of LLM aggregation strategies in recommender systems, demonstrating that meta-model aggregation with natural language reasoning capabilities achieves measurable performance gains beyond statistical aggregation alone.}, DOI = {10.3390/fi18020112} }